In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[1]:
In [2]:
%matplotlib inline
In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import matplotlib.pyplot as plt

from bson.objectid import ObjectId

from sgmtradingcore.analytics.metrics import flat_capital_metrics
from stratagemdataprocessing.dbutils.mongo import MongoPersister
In [4]:
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16, 8)
In [5]:
db = MongoPersister.init_from_config('backtesting', auto_connect=True)
In [6]:
name = 'coint'
desc = 'cochrane_orcutt'
code = 'ft12_ftps.nba'
mnemonic = 'ts.mmp_entryexit_barebones'
trading_id = '562f5bef497aee1c22000001'
config_id = '5a02ec4c9316de1207962d17'
In [7]:
print 'Strategy name:', name
print 'Strategy desc:', desc
print 'Strategy code:', code
print 'Mnemonic:     ', mnemonic
Strategy name: coint
Strategy desc: cochrane_orcutt
Strategy code: ft12_ftps.nba
Mnemonic:      ts.mmp_entryexit_barebones
In [8]:
from pprint import pprint
pprint(db['strategy_configurations'].find_one({'_id': ObjectId(config_id)}))
{u'_id': ObjectId('5a02ec4c9316de1207962d17'),
 u'params': {u'is_template_strategy': True,
             u'risk_controller': {u'component_name': u'PassAllRiskController',
                                  u'config': {},
                                  u'config_name': u'default'},
             u'scope_provider': {u'component_name': u'BasketballEventTradingScopeProvider',
                                 u'config': {u'competitions': [u'NBA']},
                                 u'config_name': u'nba'},
             u'signals_generators': [{u'component_name': u'CointegrationSignalGenerator',
                                      u'config': {u'interpolation_resolution': 10.0,
                                                  u'lookback_horizon': 1200.0,
                                                  u'pair_type': u'ft12_ftps',
                                                  u'trade_frequency': 60.0},
                                      u'config_name': u'ft12_ftps'}],
             u'sizers': {u'CointegrationTrader': {u'component_name': u'BasicSizer',
                                                  u'config': {u'capital_fraction': 0.01},
                                                  u'config_name': u'fractional'}},
             u'sport': u'basketball',
             u'sport_filter': [6],
             u'strategy_code': u'ft12_ftps.nba',
             u'strategy_desc': u'cochrane_orcutt',
             u'strategy_name': u'coint',
             u'style': u'in-play',
             u'subscription_after_delta': u'06:00:00:000000',
             u'subscription_before_delta': u'01:00:00:000000',
             u'subscription_timedelta': u'06:00:00:001000',
             u'traders': [{u'component_name': u'CointegrationTrader',
                           u'config': {u'max_holding_period': 0.0,
                                       u'min_holding_period': 0.0,
                                       u'odds_bounds': [1.05, 5.0],
                                       u'stop_loss': 0.0,
                                       u'take_profit': 0.0},
                           u'config_name': u'foo'}],
             u'trading_user_id': u'562f5bef497aee1c22000001'},
 u'sha256': u'1e5d52819199c4665db8755687f3a495feb851cd38fbea94d8491427c414a601',
 u'strategy_code': u'ft12_ftps.nba',
 u'strategy_desc': u'cochrane_orcutt',
 u'strategy_name': u'coint',
 u'update_dt': datetime.datetime(2017, 11, 8, 11, 36, 44, 73000, tzinfo=<bson.tz_util.FixedOffset object at 0x7fe8026b8e50>)}
In [9]:
def to_dataframe(orders):
    settled = filter(lambda o: o['status_str'] == 'SETTLED', orders)

    cols = ['placed_time', 'pnl', 'size', 'bet_side', 'price', 'date_day', 'event_id', 'sticker', 'details']
    rcols = {'placed_time': 'dt', 'size': 'stake', 'price': 'odds', 'date_day': 'date'}

    df = pd.DataFrame(settled, columns=cols).rename(columns=rcols)

    df['is_back'] = (df['bet_side'] == 'back')
    df['capital'] = 10000
    df['reason'] = df['details'].apply(lambda d: d.get('reason'))
    df['pair_key'] = df['details'].apply(lambda d: tuple(d['pair_key']))
    df['portfolio_id'] = df['details'].apply(lambda d: d['portfolio_id'])
    df['plausible_returns'] = df['details'].apply(lambda d: d.get('plausible_returns', []))
    df['weight'] = df['details'].apply(lambda d: None if d['trade_intention']['name'] != 'OpenTradeSingleSticker' else d['signals'][0]['value'][d['trade_intention']['sticker']])

    del df['details']

    return df

rows = list(db['strategy_results'].find({
    'strategy_name': name,
    'strategy_desc': desc,
    'strategy_code': code,
#     'trading_user_id': trading_id,
    'mnemonic': mnemonic,
    'config_id': config_id
}))

df = pd.concat([
    to_dataframe(list(db['orders'].find({
        'strategy_result_id': str(r['_id'])
    }))) for r in rows if r['n_orders'] > 0
])

df['month'] = df.dt.dt.month

Metrics:

In [12]:
flat_capital_metrics(df, groupby='month').T
Out[12]:
month 1 2 3 4 5 10 11 12
n_trades 7890.000000 5593.000000 8592.000000 5796.000000 664.000000 1492.000000 9841.000000 8251.000000
n_win 4113.000000 2826.000000 4326.000000 2923.000000 330.000000 738.000000 4915.000000 4061.000000
n_loss 3777.000000 2767.000000 4266.000000 2873.000000 334.000000 754.000000 4926.000000 4190.000000
hit_ratio 0.521293 0.505274 0.503492 0.504313 0.496988 0.494638 0.499441 0.492183
average_trade_win 0.003620 0.003673 0.003787 0.003951 0.003860 0.003990 0.003754 0.003856
average_trade_loss 0.004883 0.004627 0.004582 0.004927 0.004508 0.004621 0.004666 0.004592
unitary_stake_return -0.063892 -0.103149 -0.102551 -0.053198 -0.124372 -0.087892 -0.121749 -0.100390
cr_trade 0.807229 0.810837 0.838041 0.815885 0.845953 0.844955 0.802824 0.813739
cr_day NaN 0.000288 0.000636 0.005030 0.066088 NaN 0.007092 NaN
cum_return -3.555397 -2.421873 -3.165928 -2.606194 -0.231943 -0.540261 -4.531942 -3.584114
volatility (not annualised) 0.615232 0.318575 0.347177 0.803480 0.094642 0.131584 0.810266 0.420650
sharpe_ratio -5.778949 -7.602213 -9.119055 -3.243631 -2.450733 -4.105827 -5.593152 -8.520408
maximum_drawdown -3.504600 -2.287268 -3.019558 -2.396229 -0.236488 -0.382441 -4.496827 -3.526496
drawdown_duration (days) 30.000000 27.000000 30.000000 29.000000 20.000000 5.000000 29.000000 30.000000
maximum_runup 0.000000 0.000902 -0.003632 -0.006826 0.010076 0.000000 -0.010385 0.000000
runup_duration (days) 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 1.000000 0.000000
total_pnl -35553.970493 -24218.732766 -31659.278625 -26061.939352 -2319.426721 -5402.607490 -45319.420216 -35841.138088
n_trading_days 31.000000 22.000000 31.000000 29.000000 16.000000 6.000000 30.000000 31.000000
In [19]:
flat_capital_metrics(df, groupby='date')['total_pnl'].hist()
plt.title('Distribution of total daily PnL.')
plt.xlabel('PnL [GBp]')
_ = plt.ylabel('Frequency [%]')
In [20]:
flat_capital_metrics(df, groupby='event_id')['cum_return'].hist()
plt.title('Distribution of returns per event.')
plt.xlabel('Return [%]')
_ = plt.ylabel('Frequency [-]')

Portfolio-based analysis:

Let a portfolio, $\Pi_i$, be the set of trades associated with a given signal to take a position on $n > 0$ markets. Each portfolio will typically comprise $2n$ trades (for each market we have one trade to open and one to close), though it need only have 1 actual trade to be valid. We define the following properties:

  • PnL - The total accumulated profit and loss for the portfolio, starting from the opening time, $t_0^{(i)}$, up to the closing time, $T^{(i)}$.
  • Age - The total time (in seconds) that the portfolio was held open in the $n$ markets: $T^{(i)} - t_0^{(i)}$.
  • Reason - The cause for closing the portfolio.
In [33]:
pdf = pd.DataFrame(columns=['portfolio_id', 'pnl', 'age', 'reason']).set_index('portfolio_id')
gps = df.groupby('portfolio_id')

for g in gps.groups:
    gp = gps.get_group(g).sort_values('dt')
    pdf.loc[g] = [gp.pnl.sum(), (gp.iloc[-1]['dt'] - gp.iloc[0]['dt']).total_seconds(), gp.iloc[-1].reason]

pdf['age'] = pdf['age'].astype(np.int)
pdf.loc[pd.isnull(pdf.reason), 'reason'] = 'unclosed'
/home/tspooner/.venv/st/lib/python2.7/site-packages/ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
In [42]:
pdf.describe().T
Out[42]:
count mean std min 25% 50% 75% max
pnl 10471.0 -19.709341 85.970710 -5214.745273 -29.476173 -12.806905 -3.357301 255.6875
age 10471.0 933.079458 862.379637 0.000000 363.000000 723.000000 1263.000000 6847.0000
In [43]:
print 'Minimum PnL:'
pdf[pdf.pnl == pdf.pnl.min()]
Minimum PnL:
Out[43]:
pnl age reason
portfolio_id
5a031a579316de3788b5cbe7 -5214.745273 2283 portfolio_conflict
In [44]:
print 'Maxmimum PnL:'
pdf[pdf.pnl == pdf.pnl.max()]
Maxmimum PnL:
Out[44]:
pnl age reason
portfolio_id
5a0313899316de1d914fbae7 255.6875 0 unclosed
In [66]:
pnl_out_lim = pdf.pnl.quantile(0.005)
In [68]:
axes = pdf[pdf.pnl > pnl_out_lim][['pnl', 'age']].hist()[0]
plt.suptitle('Distribution of portfolio age and PnL (exclusing extremes).')

axes[0].set_xlabel('Age [s]')
axes[0].set_ylabel('Frequency [-]')

axes[1].set_xlabel('PnL [GBp]')
_ = axes[1].set_ylabel('Frequency [-]')
In [69]:
sns.distplot(pdf[pdf.pnl > pnl_out_lim][pdf.age < pdf.age.quantile(0.25)]['pnl'])
sns.distplot(pdf[pdf.pnl > pnl_out_lim][pdf.age > pdf.age.quantile(0.75)]['pnl'])
plt.legend(['Age < q25', 'Age > q75'])

plt.title('Distribution of portfolio PnLs for the upper and lower quantiles on age (exclusing extremes).')
plt.xlabel('PnL [GBp]')
_ = plt.ylabel('Frequency [-]')
/home/tspooner/.venv/st/lib/python2.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
/home/tspooner/.venv/st/lib/python2.7/site-packages/ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  
In [70]:
df.groupby('reason')['pnl'].describe().T
Out[70]:
reason                   
portfolio_conflict  count    27177.000000
                    mean        -7.612720
                    std         75.540715
                    min      -5462.290642
                    25%        -33.048000
                    50%         -0.123239
                    75%         18.050765
                    max       1514.954000
Name: pnl, dtype: float64
In [79]:
f, ax = plt.subplots(1)
gps = pdf[pdf.reason.values != None][pdf.pnl > pnl_out_lim].groupby('reason')

for gid, gp in gps:
    gp.pnl.hist(alpha=0.6, label=gid)

_ = plt.legend()
In [81]:
sns.lmplot("age", "pnl", data=pdf, hue="reason", fit_reg=False, aspect=1.8)
plt.title('Distribution of portfolio PnLs as a function of age.')
plt.xlabel('Age [s]')
_ = plt.ylabel('PnL [GBp]')

Best/worst case analysis:

In [82]:
def get_ret(gp, f=lambda vs: vs[-1]):
    prs = gp.sort_values('dt')['plausible_returns'].values
    lens = map(len, prs)
    ix = np.argmax(lens)

    if any(l > 0 for l in lens):
        return f(prs[ix])
    else:
        return 0.0
In [83]:
get_best_ret = lambda gp: get_ret(gp, lambda vs: max(vs))
get_worst_ret = lambda gp: get_ret(gp, lambda vs: min(vs))
In [85]:
gps = df.groupby('portfolio_id')
actual_rets = gps.apply(get_ret)
best_rets = gps.apply(get_best_ret)
worst_rets = gps.apply(get_worst_ret)
In [93]:
plt.scatter(actual_rets, best_rets-worst_rets)
plt.xlabel('$r_{actual}$ [-]')
plt.ylabel('$r_{best} - r_{worst}$ [-]')
_ = plt.title('Relationship between realised return and the range of possible returns during the portfolios life.', y=1.02)
In [94]:
plt.scatter(best_rets, worst_rets)
plt.xlabel('$r_{best}$ [-]')
plt.ylabel('$r_{worst}$ [-]')
_ = plt.title('Relationship between best and worst returns that could have been realised.', y=1.02)
In [96]:
(best_rets-actual_rets).hist(alpha=0.6)
(best_rets-worst_rets).hist(alpha=0.6)
(actual_rets-worst_rets).hist(alpha=0.6)

plt.xlabel('$r$ [-]')
plt.ylabel('Frequency [-]')
plt.title('Distribution of various differences between best/actual/worst possible returns.', y=1.02)
_ = plt.legend(['$r_{best} - r_{actual}$', '$r_{best} - r_{worst}$', '$r_{actual} - r_{worst}$'], loc='best', prop={'size': 20})

Time series:

In [97]:
def load_df(s):
    from sgmarb.backtesting.data import clean_dataframe
    from stratagemdataprocessing.bookmakers.common.odds.cache import HistoricalOddsCache

    hoc = HistoricalOddsCache(parse=False)
    sdf = clean_dataframe(hoc.get('%s.BF' % s), min_matched=0)
    if sdf is not None:
        sdf['timestamp'] = pd.to_datetime(sdf.timestamp, unit='ms')
        return sdf.set_index('timestamp')
    else:
        return None
In [98]:
def do_plot(pk):
    edf = df[df.pair_key == pk].sort_values('dt')
    stickers = edf['sticker'].unique()

    sdfs = {s: load_df(s) for s in stickers}
    sdfs = {s: df for (s, df) in sdfs.iteritems() if df is not None}

    f, axes = plt.subplots(len(sdfs), sharex=True, figsize=(16, 4*len(sdfs)))
    plt.suptitle(pk)

    for i, (s, sdf) in enumerate(sdfs.iteritems()):
        mp = (sdf['bp1'] + sdf['lp1']) / 2.0
        axes[i].plot(sdf.index, mp, drawstyle='steps-post', color='k', alpha=0.5)
        axes[i].set_title(s)
        axes[i].set_ylim([1.0, min(10.0, mp.max()*1.1)])

        for pid, p in edf[edf.sticker == s].groupby('portfolio_id'):
            open_dt = p['dt'].min()
            close_dt = p['dt'].max()

            if p[p.dt == open_dt].iloc[0].is_back:
                axes[i].axvspan(open_dt, close_dt, color='red', alpha=0.1, label='Back')
            else:
                axes[i].axvspan(open_dt, close_dt, color='blue', alpha=0.1, label='Lay')
In [99]:
for pk in df.pair_key.unique()[:50]:
    do_plot(pk)
/home/tspooner/.venv/st/local/lib/python2.7/site-packages/matplotlib/pyplot.py:424: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)